Importing libraries¶

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns
import calendar
import warnings
warnings.filterwarnings('ignore')

Week1¶

Data preprocessing & cleaning¶

In [2]:
df = pd.read_csv('Supply_chain.csv')
In [3]:
df.head()
Out[3]:
Product type SKU Price Availability Number of products sold Revenue generated Customer demographics Stock levels Lead times Order quantities ... Lead time Production volumes Manufacturing lead time Manufacturing costs Inspection results Defect rates Transportation modes Routes Costs Date
0 fragrance Woody Eau de Parfum 1.699976 39 868 9866.465458 Non-binary 57 10 52 ... 6 656 3 30.288965 Fail 8.923234 Rail Route E 384.399820 01-01-18 9:20
1 healthcare Multivitamin Tablets 75.202244 87 809 8920.095776 Unknown 58 28 47 ... 23 975 21 40.798640 Pass 1.547675 Rail Route B 875.980893 01-01-18 17:47
2 bodycare Exfoliating Soap 22.916503 90 832 6777.144443 Female 59 33 69 ... 30 296 22 70.226789 Pending 3.216068 Air Route C 674.232825 02-01-18 0:50
3 bodycare Coconut Body Butter 80.065417 75 167 5711.616781 Male 0 32 44 ... 26 529 25 97.717130 Pass 1.449842 Road Route E 723.878925 02-01-18 1:18
4 bodycare Exfoliating Soap 74.364683 100 400 7338.061362 Male 31 32 30 ... 25 985 6 52.792169 Fail 2.068435 Air Route J 103.916248 02-01-18 2:35

5 rows × 25 columns

In [4]:
No_Rows = df.shape[0]
No_Columns = df.shape[1]

print("No_Rows =", No_Rows, "\nNo_Columns =", No_Columns)
No_Rows = 10000 
No_Columns = 25
In [5]:
df.columns.to_list()
Out[5]:
['Product type',
 'SKU',
 'Price',
 'Availability',
 'Number of products sold',
 'Revenue generated',
 'Customer demographics',
 'Stock levels',
 'Lead times',
 'Order quantities',
 'Shipping times',
 'Shipping carriers',
 'Shipping costs',
 'Supplier name',
 'Location',
 'Lead time',
 'Production volumes',
 'Manufacturing lead time',
 'Manufacturing costs',
 'Inspection results',
 'Defect rates',
 'Transportation modes',
 'Routes',
 'Costs',
 'Date']
In [6]:
df.dtypes
Out[6]:
Product type                object
SKU                         object
Price                      float64
Availability                 int64
Number of products sold      int64
Revenue generated          float64
Customer demographics       object
Stock levels                 int64
Lead times                   int64
Order quantities             int64
Shipping times               int64
Shipping carriers           object
Shipping costs             float64
Supplier name               object
Location                    object
Lead time                    int64
Production volumes           int64
Manufacturing lead time      int64
Manufacturing costs        float64
Inspection results          object
Defect rates               float64
Transportation modes        object
Routes                      object
Costs                      float64
Date                        object
dtype: object
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product type             10000 non-null  object 
 1   SKU                      10000 non-null  object 
 2   Price                    10000 non-null  float64
 3   Availability             10000 non-null  int64  
 4   Number of products sold  10000 non-null  int64  
 5   Revenue generated        10000 non-null  float64
 6   Customer demographics    10000 non-null  object 
 7   Stock levels             10000 non-null  int64  
 8   Lead times               10000 non-null  int64  
 9   Order quantities         10000 non-null  int64  
 10  Shipping times           10000 non-null  int64  
 11  Shipping carriers        10000 non-null  object 
 12  Shipping costs           10000 non-null  float64
 13  Supplier name            10000 non-null  object 
 14  Location                 10000 non-null  object 
 15  Lead time                10000 non-null  int64  
 16  Production volumes       10000 non-null  int64  
 17  Manufacturing lead time  10000 non-null  int64  
 18  Manufacturing costs      10000 non-null  float64
 19  Inspection results       10000 non-null  object 
 20  Defect rates             10000 non-null  float64
 21  Transportation modes     10000 non-null  object 
 22  Routes                   10000 non-null  object 
 23  Costs                    10000 non-null  float64
 24  Date                     10000 non-null  object 
dtypes: float64(6), int64(9), object(10)
memory usage: 1.9+ MB
In [8]:
df.describe().T
Out[8]:
count mean std min 25% 50% 75% max
Price 10000.0 49.187991 27.985300 1.699976 27.893450 49.021973 70.393824 99.171329
Availability 10000.0 48.280300 27.954661 1.000000 27.000000 48.000000 69.000000 100.000000
Number of products sold 10000.0 466.971700 274.888689 8.000000 258.000000 465.000000 663.000000 996.000000
Revenue generated 10000.0 22301.711221 20417.820954 12.924508 5824.249530 17068.018842 33482.836051 103648.261247
Stock levels 10000.0 47.780600 28.214696 0.000000 26.000000 48.000000 69.000000 100.000000
Lead times 10000.0 27.956500 7.243083 2.000000 23.000000 28.000000 34.000000 47.000000
Order quantities 10000.0 49.070300 25.203331 1.000000 31.000000 49.000000 67.000000 96.000000
Shipping times 10000.0 5.732300 2.492761 1.000000 4.000000 6.000000 8.000000 10.000000
Shipping costs 10000.0 5.551080 2.451311 1.013487 3.774130 5.520123 7.365787 9.929816
Lead time 10000.0 21.371200 6.651664 1.000000 17.000000 22.000000 27.000000 33.000000
Production volumes 10000.0 569.339100 241.097196 104.000000 392.000000 570.000000 749.000000 985.000000
Manufacturing lead time 10000.0 14.835900 8.158077 1.000000 9.000000 15.000000 21.000000 30.000000
Manufacturing costs 10000.0 47.498517 26.472004 1.085069 27.853994 47.371074 66.662905 99.466109
Defect rates 10000.0 3.128166 2.204568 0.500542 1.497578 2.636311 3.756214 9.985117
Costs 10000.0 72.680371 108.749141 2.058718 34.102315 54.714348 75.298942 997.413450
In [9]:
# Check categorical stats
df.describe(include='O')
Out[9]:
Product type SKU Customer demographics Shipping carriers Supplier name Location Inspection results Transportation modes Routes Date
count 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000
unique 10 105 4 5 5 8 3 4 10 9987
top healthcare Woody Eau de Parfum Female FedEx Supplier D Hyderabad Pass Air Route B 07-02-20 14:11
freq 1091 118 2546 2070 2032 1315 3385 2559 1052 2
In [10]:
for c in df.select_dtypes(include=["object", "string"]).columns:
    df[c] = df[c].astype("string").str.replace(r"\s+", " ", regex=True).str.strip()
    df[c] = df[c].replace({"": pd.NA})
In [11]:
df.head()
Out[11]:
Product type SKU Price Availability Number of products sold Revenue generated Customer demographics Stock levels Lead times Order quantities ... Lead time Production volumes Manufacturing lead time Manufacturing costs Inspection results Defect rates Transportation modes Routes Costs Date
0 fragrance Woody Eau de Parfum 1.699976 39 868 9866.465458 Non-binary 57 10 52 ... 6 656 3 30.288965 Fail 8.923234 Rail Route E 384.399820 01-01-18 9:20
1 healthcare Multivitamin Tablets 75.202244 87 809 8920.095776 Unknown 58 28 47 ... 23 975 21 40.798640 Pass 1.547675 Rail Route B 875.980893 01-01-18 17:47
2 bodycare Exfoliating Soap 22.916503 90 832 6777.144443 Female 59 33 69 ... 30 296 22 70.226789 Pending 3.216068 Air Route C 674.232825 02-01-18 0:50
3 bodycare Coconut Body Butter 80.065417 75 167 5711.616781 Male 0 32 44 ... 26 529 25 97.717130 Pass 1.449842 Road Route E 723.878925 02-01-18 1:18
4 bodycare Exfoliating Soap 74.364683 100 400 7338.061362 Male 31 32 30 ... 25 985 6 52.792169 Fail 2.068435 Air Route J 103.916248 02-01-18 2:35

5 rows × 25 columns

In [12]:
current_columns = [
 'Product type',
 'SKU',
 'Price',
 'Availability',
 'Number of products sold',
 'Revenue generated',
 'Customer demographics',
 'Stock levels',
 'Lead times',
 'Order quantities',
 'Shipping times',
 'Shipping carriers',
 'Shipping costs',
 'Supplier name',
 'Location',
 'Lead time',
 'Production volumes',
 'Manufacturing lead time',
 'Manufacturing costs',
 'Inspection results',
 'Defect rates',
 'Transportation modes',
 'Routes',
 'Costs',
 'Date'
]



new_column_names_map = {col: col.replace(' ', '_') for col in current_columns}


df.rename(columns=new_column_names_map, inplace=True)
In [13]:
df.head()
Out[13]:
Product_type SKU Price Availability Number_of_products_sold Revenue_generated Customer_demographics Stock_levels Lead_times Order_quantities ... Lead_time Production_volumes Manufacturing_lead_time Manufacturing_costs Inspection_results Defect_rates Transportation_modes Routes Costs Date
0 fragrance Woody Eau de Parfum 1.699976 39 868 9866.465458 Non-binary 57 10 52 ... 6 656 3 30.288965 Fail 8.923234 Rail Route E 384.399820 01-01-18 9:20
1 healthcare Multivitamin Tablets 75.202244 87 809 8920.095776 Unknown 58 28 47 ... 23 975 21 40.798640 Pass 1.547675 Rail Route B 875.980893 01-01-18 17:47
2 bodycare Exfoliating Soap 22.916503 90 832 6777.144443 Female 59 33 69 ... 30 296 22 70.226789 Pending 3.216068 Air Route C 674.232825 02-01-18 0:50
3 bodycare Coconut Body Butter 80.065417 75 167 5711.616781 Male 0 32 44 ... 26 529 25 97.717130 Pass 1.449842 Road Route E 723.878925 02-01-18 1:18
4 bodycare Exfoliating Soap 74.364683 100 400 7338.061362 Male 31 32 30 ... 25 985 6 52.792169 Fail 2.068435 Air Route J 103.916248 02-01-18 2:35

5 rows × 25 columns

In [14]:
df = df.dropna(how="all")  
df = df.reset_index(drop=True)
In [15]:
No_Rows = df.shape[0]
No_Columns = df.shape[1]

print("No_Rows =", No_Rows, "\nNo_Columns =", No_Columns)
No_Rows = 10000 
No_Columns = 25
In [16]:
df.dtypes.to_frame("dtype")
Out[16]:
dtype
Product_type string[python]
SKU string[python]
Price float64
Availability int64
Number_of_products_sold int64
Revenue_generated float64
Customer_demographics string[python]
Stock_levels int64
Lead_times int64
Order_quantities int64
Shipping_times int64
Shipping_carriers string[python]
Shipping_costs float64
Supplier_name string[python]
Location string[python]
Lead_time int64
Production_volumes int64
Manufacturing_lead_time int64
Manufacturing_costs float64
Inspection_results string[python]
Defect_rates float64
Transportation_modes string[python]
Routes string[python]
Costs float64
Date string[python]
In [17]:
num_cols = df.select_dtypes(include="number").columns
neg_counts = {c: int((df[c] < 0).sum()) for c in num_cols}
pd.Series(neg_counts, name="negative_values_count")
Out[17]:
Price                      0
Availability               0
Number_of_products_sold    0
Revenue_generated          0
Stock_levels               0
Lead_times                 0
Order_quantities           0
Shipping_times             0
Shipping_costs             0
Lead_time                  0
Production_volumes         0
Manufacturing_lead_time    0
Manufacturing_costs        0
Defect_rates               0
Costs                      0
Name: negative_values_count, dtype: int64
In [18]:
nulls = df.isna().sum().sort_values(ascending=False)
nulls.to_frame("nulls_count")
Out[18]:
nulls_count
Product_type 0
SKU 0
Price 0
Availability 0
Number_of_products_sold 0
Revenue_generated 0
Customer_demographics 0
Stock_levels 0
Lead_times 0
Order_quantities 0
Shipping_times 0
Shipping_carriers 0
Shipping_costs 0
Supplier_name 0
Location 0
Lead_time 0
Production_volumes 0
Manufacturing_lead_time 0
Manufacturing_costs 0
Inspection_results 0
Defect_rates 0
Transportation_modes 0
Routes 0
Costs 0
Date 0
In [19]:
n_dups = int(df.duplicated().sum())
dups_preview = df[df.duplicated(keep=False)].head(10)
print("NO.Duplicated:", n_dups)
dups_preview.T
NO.Duplicated: 0
Out[19]:
Product_type
SKU
Price
Availability
Number_of_products_sold
Revenue_generated
Customer_demographics
Stock_levels
Lead_times
Order_quantities
Shipping_times
Shipping_carriers
Shipping_costs
Supplier_name
Location
Lead_time
Production_volumes
Manufacturing_lead_time
Manufacturing_costs
Inspection_results
Defect_rates
Transportation_modes
Routes
Costs
Date
In [20]:
Date = [c for c in df.columns if "date" in c.lower()]

for c in Date:
    df[c] = pd.to_datetime(df[c], errors="coerce", dayfirst=True)

df[Date].head()
Out[20]:
Date
0 2018-01-01 09:20:00
1 2018-01-01 17:47:00
2 2018-01-02 00:50:00
3 2018-01-02 01:18:00
4 2018-01-02 02:35:00
In [21]:
# خلية 12: عرض التاريخ فقط بدون الوقت (YYYY-MM-DD)
for c in Date:
    # نحول القيمة إلى التاريخ فقط
    df[c] = df[c].dt.date

df[Date].head()
Out[21]:
Date
0 2018-01-01
1 2018-01-01
2 2018-01-02
3 2018-01-02
4 2018-01-02
In [22]:
df.head()
Out[22]:
Product_type SKU Price Availability Number_of_products_sold Revenue_generated Customer_demographics Stock_levels Lead_times Order_quantities ... Lead_time Production_volumes Manufacturing_lead_time Manufacturing_costs Inspection_results Defect_rates Transportation_modes Routes Costs Date
0 fragrance Woody Eau de Parfum 1.699976 39 868 9866.465458 Non-binary 57 10 52 ... 6 656 3 30.288965 Fail 8.923234 Rail Route E 384.399820 2018-01-01
1 healthcare Multivitamin Tablets 75.202244 87 809 8920.095776 Unknown 58 28 47 ... 23 975 21 40.798640 Pass 1.547675 Rail Route B 875.980893 2018-01-01
2 bodycare Exfoliating Soap 22.916503 90 832 6777.144443 Female 59 33 69 ... 30 296 22 70.226789 Pending 3.216068 Air Route C 674.232825 2018-01-02
3 bodycare Coconut Body Butter 80.065417 75 167 5711.616781 Male 0 32 44 ... 26 529 25 97.717130 Pass 1.449842 Road Route E 723.878925 2018-01-02
4 bodycare Exfoliating Soap 74.364683 100 400 7338.061362 Male 31 32 30 ... 25 985 6 52.792169 Fail 2.068435 Air Route J 103.916248 2018-01-02

5 rows × 25 columns

In [23]:
df["Expected_Revenue"] = df["Price"] * df["Number_of_products_sold"]
df["Expected_Costs"] = df["Manufacturing_costs"] + df["Shipping_costs"]
In [24]:
df["Revenue_Deviation"] = ((df["Revenue_generated"] - df["Expected_Revenue"]) /
                             df["Expected_Revenue"]) * 100

df["Cost_Deviation"] = ((df["Costs"] - df["Expected_Costs"]) /
                          df["Expected_Costs"]) * 100
In [25]:
new_order = [
    'Product_type', 'SKU', 'Price', 'Availability', 'Number_of_products_sold',
    'Revenue_generated', 'Expected_Revenue', 'Revenue_Deviation',
    'Customer_demographics', 'Stock_levels', 'Lead_times', 'Order_quantities',
    'Shipping_times', 'Shipping_carriers',
    'Shipping_costs', 'Manufacturing_costs', 'Costs', 'Expected_Costs', 'Cost_Deviation',
    'Supplier_name', 'Location', 'Lead_time', 'Production_volumes',
    'Manufacturing_lead_time', 'Inspection_results', 'Defect_rates',
    'Transportation_modes', 'Routes', 'Date'
]

df = df[new_order]
In [26]:
df.head()
Out[26]:
Product_type SKU Price Availability Number_of_products_sold Revenue_generated Expected_Revenue Revenue_Deviation Customer_demographics Stock_levels ... Supplier_name Location Lead_time Production_volumes Manufacturing_lead_time Inspection_results Defect_rates Transportation_modes Routes Date
0 fragrance Woody Eau de Parfum 1.699976 39 868 9866.465458 1475.579180 568.650357 Non-binary 57 ... Supplier B Mumbai 6 656 3 Fail 8.923234 Rail Route E 2018-01-01
1 healthcare Multivitamin Tablets 75.202244 87 809 8920.095776 60838.615236 -85.338102 Unknown 58 ... Supplier D Delhi 23 975 21 Pass 1.547675 Rail Route B 2018-01-01
2 bodycare Exfoliating Soap 22.916503 90 832 6777.144443 19066.530896 -64.455283 Female 59 ... Supplier E Chennai 30 296 22 Pending 3.216068 Air Route C 2018-01-02
3 bodycare Coconut Body Butter 80.065417 75 167 5711.616781 13370.924563 -57.283307 Male 0 ... Supplier A Bangalore 26 529 25 Pass 1.449842 Road Route E 2018-01-02
4 bodycare Exfoliating Soap 74.364683 100 400 7338.061362 29745.873061 -75.330825 Male 31 ... Supplier D Hyderabad 25 985 6 Fail 2.068435 Air Route J 2018-01-02

5 rows × 29 columns

In [27]:
outliers = df[(df["Revenue_Deviation"].abs() > 10) | (df["Cost_Deviation"].abs() > 10)]
In [28]:
No_Rows = outliers.shape[0]
print("No_Rows =", No_Rows)
No_Rows = 430
In [29]:
outlier_indices = outliers.index

df = df.drop(outlier_indices)
In [30]:
No_Rows = df.shape[0]
No_Columns = df.shape[1]

print("No_Rows =", No_Rows, "\nNo_Columns =", No_Columns)
No_Rows = 9570 
No_Columns = 29
In [31]:
EDA_Columns = ['Product_type',
 'Customer_demographics',
 'Shipping_carriers',
 'Supplier_name',
 'Location',
 'Inspection_results',
 'Transportation_modes',
 'Routes',]

for col in EDA_Columns :
    print(df[col].value_counts())
    print('-----------------*************--------------------')
    print('-----------------*************--------------------')
Product_type
healthcare    1039
grooming       980
fragrance      969
bodycare       969
cosmetics      968
skincare       964
babycare       941
wellness       931
haircare       918
nailcare       891
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Customer_demographics
Female        2441
Unknown       2415
Male          2396
Non-binary    2318
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Shipping_carriers
FedEx        1984
UPS          1904
Blue Dart    1901
Delhivery    1891
DHL          1890
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Supplier_name
Supplier D    1935
Supplier B    1931
Supplier A    1929
Supplier E    1893
Supplier C    1882
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Location
Hyderabad    1256
Mumbai       1207
Delhi        1203
Pune         1199
Kolkata      1190
Bangalore    1174
Ahmedabad    1174
Chennai      1167
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Inspection_results
Pass       3221
Fail       3177
Pending    3172
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Transportation_modes
Air     2449
Sea     2422
Rail    2353
Road    2346
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
Routes
Route B    1004
Route G     975
Route A     970
Route H     966
Route C     960
Route F     956
Route J     946
Route E     937
Route D     931
Route I     925
Name: count, dtype: Int64
-----------------*************--------------------
-----------------*************--------------------
In [32]:
df["Net_profit"] = df["Expected_Revenue"] - df["Costs"]
In [33]:
df["Delay"] = df["Lead_times"] - (df["Lead_time"] + df["Shipping_times"]) 
In [35]:
df.to_csv("supply_chain_cleaned.csv", index=False)
In [34]:
df["Delay"] = df["Delay"].clip(lower=0)
In [35]:
df = df.round(2)
In [ ]:
 
In [ ]:
 
In [ ]:
 

Week2¶

المجموعة 1: التحليل الوصفي للبيانات¶

In [36]:
# Sort by Number_of_products_sold and get the top 10
top_selling_products = df.sort_values('Number_of_products_sold', ascending=False).head(10)



plt.figure(figsize=(10,6))
ax = sns.barplot(x='Number_of_products_sold', y='SKU', data=top_selling_products, palette="Blues_d")
plt.title("Top Selling Products by Number of Products Sold", fontsize=16)
plt.xlabel('Number of Products Sold', fontsize=12)
plt.ylabel('SKU', fontsize=12)

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='white')

plt.show()
No description has been provided for this image
In [37]:
# Get the product with the highest revenue
highest_revenue_product = df.loc[df['Revenue_generated'].idxmax()]

# Display the product with highest revenue
print(highest_revenue_product[['SKU', 'Revenue_generated']])
SKU                  Green Tea Pack
Revenue_generated         103648.26
Name: 8234, dtype: object
In [38]:
# Get the SKU with the highest net profit
highest_profit_sku = df.loc[df['Net_profit'].idxmax()]

# Display the SKU with the highest net profit
print(highest_profit_sku[['SKU', 'Net_profit']])
SKU           Aromatherapy Candle
Net_profit               98771.12
Name: 4556, dtype: object
In [39]:
# Calculate average price per Product_type
avg_price_per_type = df.groupby('Product_type')['Price'].mean().sort_values(ascending=False).round(2)

# Plot average price per product type
plt.figure(figsize=(10,6))
ax = avg_price_per_type.plot(kind='bar', color='teal')
plt.title('Average Price per Product Type', fontsize=16)
plt.xlabel('Product Type', fontsize=12)
plt.ylabel('Average Price', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10)

plt.show()
No description has been provided for this image
In [40]:
# Calculate net profit per product type
net_profit_per_type = df.groupby('Product_type')['Net_profit'].sum().sort_values(ascending=False)



# Plot net profit per product type as horizontal bar chart
plt.figure(figsize=(10,6))
ax = net_profit_per_type.plot(kind='barh', color='green')
plt.title('Net Profit per Product Type', fontsize=16)
plt.xlabel('Net Profit', fontsize=12)
plt.ylabel('Product Type', fontsize=12)

# Display the values inside the bars (in the middle of each bar)
ax.bar_label(ax.containers[0], fmt='{:,.0f}', fontsize=10, color='white', label_type='center')

plt.show()
No description has been provided for this image
In [41]:
# Calculate the revenue difference
df['Revenue_diff'] = df['Expected_Revenue'] - df['Revenue_generated']

# Plot the difference in revenue for each product
plt.figure(figsize=(10,6))
ax = sns.barplot(x='Revenue_diff', y='SKU', data=df.sort_values('Revenue_diff', ascending=False).head(10), palette="coolwarm")
plt.title('Revenue Difference: Expected vs Generated', fontsize=16)
plt.xlabel('Revenue Difference', fontsize=12)
plt.ylabel('SKU', fontsize=12)

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='black')

plt.show()
No description has been provided for this image
In [42]:
# Sort by Defect_rates and get the top 10 products
top_defect_products = df.sort_values('Defect_rates', ascending=False).head(10)

# Plot products with the highest defect rates
plt.figure(figsize=(10,6))
ax = sns.barplot(x='Defect_rates', y='SKU', data=top_defect_products, palette="Reds_d")
plt.title('Top Products with Highest Defect Rates', fontsize=16)
plt.xlabel('Defect Rates', fontsize=12)
plt.ylabel('SKU', fontsize=12)

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='white')

plt.show()
No description has been provided for this image
In [43]:
data = df.groupby("Product_type")["Stock_levels"].mean().reset_index()
data["Stock_levels"] = data["Stock_levels"].round(0)  # أو round(1) لو عايز رقم عشري واحد
data = data.sort_values("Stock_levels", ascending=False)

ax = sns.barplot(
    data = data,
    y="Product_type",
    x="Stock_levels",
    palette="viridis"
)
plt.xticks(rotation=45)
plt.title("Average Stock Levels per Product type")

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='black')

plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 2: التحليل بين المتغيرات¶

In [44]:
# Plot the relationship between Price and Number_of_products_sold
plt.figure(figsize=(10,6))
sns.scatterplot(x='Price', y='Number_of_products_sold', data=df, color='purple')
plt.title('Price vs Number of Products Sold', fontsize=16)
plt.xlabel('Price', fontsize=12)
plt.ylabel('Number of Products Sold', fontsize=12)
plt.show()
No description has been provided for this image
In [45]:
# Plot the relationship between Order_quantities and Production_volumes
plt.figure(figsize=(10,6))
sns.scatterplot(x='Order_quantities', y='Production_volumes', data=df, color='orange')
plt.title('Order Quantities vs Production Volumes', fontsize=16)
plt.xlabel('Order Quantities', fontsize=12)
plt.ylabel('Production Volumes', fontsize=12)
plt.show()
No description has been provided for this image
In [46]:
# Plot the relationship between Lead_time and Manufacturing_lead_time
plt.figure(figsize=(10,6))
sns.scatterplot(x='Lead_time', y='Manufacturing_lead_time', data=df, color='blue')
plt.title('Lead Time vs Manufacturing Lead Time', fontsize=16)
plt.xlabel('Lead Time', fontsize=12)
plt.ylabel('Manufacturing Lead Time', fontsize=12)
plt.show()
No description has been provided for this image
In [47]:
# Plot the relationship between Shipping_costs and Net_profit
plt.figure(figsize=(10,6))
sns.scatterplot(x='Shipping_costs', y='Net_profit', data=df, color='green')
plt.title('Shipping Costs vs Net Profit', fontsize=16)
plt.xlabel('Shipping Costs', fontsize=12)
plt.ylabel('Net Profit', fontsize=12)
plt.show()
No description has been provided for this image
In [48]:
# Plot the relationship between Manufacturing_costs and Net_profit
plt.figure(figsize=(10,6))
sns.scatterplot(x='Manufacturing_costs', y='Net_profit', data=df, color='red')
plt.title('Manufacturing Costs vs Net Profit', fontsize=16)
plt.xlabel('Manufacturing Costs', fontsize=12)
plt.ylabel('Net Profit', fontsize=12)
plt.show()
No description has been provided for this image
In [49]:
# Plot the relationship between Shipping_times and Net_profit
plt.figure(figsize=(10,6))
sns.scatterplot(x='Shipping_times', y='Net_profit', data=df, color='cyan')
plt.title('Shipping Times vs Net Profit', fontsize=16)
plt.xlabel('Shipping Times', fontsize=12)
plt.ylabel('Net Profit', fontsize=12)
plt.show()
No description has been provided for this image
In [50]:
# Plot the relationship between Defect_rates and Revenue_Deviation
plt.figure(figsize=(10,6))
sns.scatterplot(x='Defect_rates', y='Revenue_Deviation', data=df, color='brown')
plt.title('Defect Rates vs Revenue Deviation', fontsize=16)
plt.xlabel('Defect Rates', fontsize=12)
plt.ylabel('Revenue Deviation', fontsize=12)
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 3: التحليل الزمني¶

In [51]:
# تحويل التاريخ لسنة
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# تجميع الإيراد لكل سنة ولكل نوع منتج
yearly_sales = df.groupby(['Year', 'Product_type'], as_index=False)['Revenue_generated'].sum()

# حساب الإيراد بالمليون
yearly_sales['Total_Revenue'] = yearly_sales['Revenue_generated'] / 1_000_000

# رسم العمود
fig = px.bar(
    yearly_sales,
    x='Year',
    y='Revenue_generated',
    color='Product_type',
    title='Annual Sales by Product Type',
    labels={'Year': 'Year', 'Revenue_generated': 'Total Revenue'},
    custom_data=['Product_type', 'Total_Revenue']
)

# تخصيص الـ tooltip
fig.update_traces(
    hovertemplate=(
        'Product: %{customdata[0]}<br>'
        'Year: %{x}<br>'
        'Total_Revenue: %{customdata[1]:.2f}M<br>'
        '<extra></extra>'
    )
)

fig.update_layout(barmode='stack')

fig.show()
In [52]:
# Group by Year and Product_type to calculate annual revenue
Yearly_revenue = df.groupby(['Year'])['Revenue_generated'].sum().reset_index()

Yearly_revenue['Total_Revenue'] = Yearly_revenue['Revenue_generated'] / 1_000_000


# Plot seasonal trend of Revenue Generated
fig = px.line(Yearly_revenue, x='Year', y='Revenue_generated', markers=True, 
              title='Annual Sales by Product Type', labels={'Total_Revenue': 'Revenue Generated'})

# Add customdata for tooltip
fig.update_traces(
    customdata=Yearly_revenue[['Total_Revenue']],
    hovertemplate=(
        'Year: %{x}<br>'
        'Total_Revenue: %{customdata[0]:.2f}M<br>'
        '<extra></extra>'
    )
)

fig.show()
In [53]:
# تأكد إن Date في شكل datetime
df['Date'] = pd.to_datetime(df['Date'])

# استخراج Year و Month
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

# Group by Year and Product_type, calculate total revenue for each year
monthly_sales = df.groupby(['Year', 'Month', 'Product_type'])['Revenue_generated'].sum().reset_index()

# Get max & min revenue rows per product
idx_max = monthly_sales.groupby('Product_type')['Revenue_generated'].idxmax()
idx_min = monthly_sales.groupby('Product_type')['Revenue_generated'].idxmin()

max_rows = monthly_sales.loc[idx_max].copy()
max_rows['Extreme'] = 'Max revenue'

min_rows = monthly_sales.loc[idx_min].copy()
min_rows['Extreme'] = 'Min revenue'

extreme_sales = pd.concat([max_rows, min_rows], ignore_index=True)

# Convert month number to name
extreme_sales['Month'] = extreme_sales['Month'].apply(lambda m: calendar.month_name[m])

# Convert revenue to k
extreme_sales['Revenue_k'] = extreme_sales['Revenue_generated'] / 1000

# Line chart
fig = px.line(
    extreme_sales,
    x='Product_type',
    y='Revenue_generated',
    color='Extreme',
    markers=True,
    title='Max & Min Month Revenue per Product Type',
    labels={'Revenue_generated': 'Revenue Generated', 'Product_type': 'Product Type'},
    custom_data=['Product_type', 'Year', 'Month', 'Revenue_k']
)

# Tooltip
fig.update_traces(
    hovertemplate=(
        'Product: %{customdata[0]}<br>'
        'Year: %{customdata[1]}<br>'
        'Month: %{customdata[2]}<br>'
        'Revenue: %{customdata[3]:.2f}k<br>'
        '<extra></extra>'
    )
)

# Clean legend title
fig.update_layout(legend_title_text='')

fig.show()
In [54]:
# التأكد من أن التاريخ datetime
df['Date'] = pd.to_datetime(df['Date'])

# استخراج رقم واسم الشهر
df['Month'] = df['Date'].dt.month
df['MonthName'] = df['Date'].dt.month_name()

# إجمالي العائد لكل Product في كل شهر عبر السنين
monthly_sales = (
    df.groupby(['Product_type', 'Month', 'MonthName'], as_index=False)['Revenue_generated']
      .sum()
      .sort_values(['Product_type', 'Month'])
)

# تحديد أعلى وأقل شهر لكل منتج
idx_max = monthly_sales.groupby('Product_type')['Revenue_generated'].idxmax()
idx_min = monthly_sales.groupby('Product_type')['Revenue_generated'].idxmin()

max_rows = monthly_sales.loc[idx_max].copy()
max_rows['Extreme'] = 'Max revenue'

min_rows = monthly_sales.loc[idx_min].copy()
min_rows['Extreme'] = 'Min revenue'

extreme_sales = pd.concat([max_rows, min_rows], ignore_index=True)

# إضافة Revenue 
extreme_sales['Revenue_M'] = extreme_sales['Revenue_generated'] / 1_000_000


# رسم Bar Chart
fig = px.bar(
    extreme_sales,
    x='Product_type',
    y='Revenue_generated',
    color='Extreme',
    barmode='group',   # يخلي الماكس والـ مين جنب بعض لكل منتج
    title='Max & Min Monthly Revenue per Product (Across All Years)',
    labels={'Product_type': 'Product Type', 'Revenue_generated': 'Revenue'},
    custom_data=['Product_type', 'MonthName', 'Revenue_M']
)

# تخصيص الـ tooltip
fig.update_traces(
    hovertemplate=(
        'Product: %{customdata[0]}<br>'
        'Month: %{customdata[1]}<br>'
        'Revenue: %{customdata[2]:.2f}M<br>'
        '<extra></extra>'
    )
)

fig.update_layout(legend_title_text='')

fig.show()
In [55]:
# Group by year to analyze the trend in Order_quantities
Yearly_order_quantity = df.groupby(['Year'])['Order_quantities'].sum().reset_index()

# Plot seasonal trend of Order Quantities
fig = px.line(Yearly_order_quantity, x='Year', y='Order_quantities', markers=True, 
              title='Trend of Order Quantities', labels={'Order_quantities': 'Order Quantities'})
fig.show()
No description has been provided for this image
In [56]:
# Ensure Date is datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract Year
df['Year'] = df['Date'].dt.year

# Compute total orders per Product_type per Year
yearly_product_orders = df.groupby(['Year', 'Product_type'], as_index=False)['Order_quantities'].sum()

# Get max and min per year
idx_max = yearly_product_orders.groupby('Year')['Order_quantities'].idxmax()
idx_min = yearly_product_orders.groupby('Year')['Order_quantities'].idxmin()

max_rows = yearly_product_orders.loc[idx_max].copy()
max_rows['Type'] = 'Max ordered'

min_rows = yearly_product_orders.loc[idx_min].copy()
min_rows['Type'] = 'Min ordered'

# If max and min are same product in a year → remove duplicate
extreme_orders = pd.concat([max_rows, min_rows], ignore_index=True)
extreme_orders = extreme_orders.drop_duplicates(subset=['Year', 'Product_type', 'Order_quantities'])

# Create line chart
fig = px.line(
    extreme_orders,
    x='Year',
    y='Order_quantities',
    color='Type',
    markers=True,
    title='Max & Min Ordered Product per Year',
    labels={'Order_quantities': 'Order Quantity'},
    custom_data=['Product_type']
)

fig.update_traces(
    hovertemplate=
    'Year: %{x}<br>' +
    'Product: %{customdata[0]}<br>' +
    'Orders: %{y}K<br>' +
    '<extra></extra>'
)

fig.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 4: تحليل الموردين¶

In [57]:
# Group by Supplier_name to find the supplier with the highest order quantities
supplier_order_quantities = df.groupby('Supplier_name')['Order_quantities'].sum().reset_index()

# Get the supplier with the highest order quantities
top_supplier_order = supplier_order_quantities.loc[supplier_order_quantities['Order_quantities'].idxmax()]

# Display the result
print(top_supplier_order)
Supplier_name       Supplier A
Order_quantities         96153
Name: 0, dtype: object
In [58]:
# Group by Supplier_name to calculate the supplier with the highest net profit
supplier_net_profit = df.groupby('Supplier_name')['Net_profit'].sum().reset_index()

# Get the supplier with the highest net profit
top_supplier_profit = supplier_net_profit.loc[supplier_net_profit['Net_profit'].idxmax()]

# Display the result
print(top_supplier_profit)
Supplier_name     Supplier D
Net_profit       44586432.77
Name: 3, dtype: object
In [59]:
# Group by Supplier_name to find the supplier causing the most delay
supplier_delay = df.groupby('Supplier_name')['Delay'].sum().reset_index()

# Get the supplier with the highest delay
top_supplier_delay = supplier_delay.loc[supplier_delay['Delay'].idxmax()]

# Display the result
print(top_supplier_delay)
Supplier_name    Supplier A
Delay                  1917
Name: 0, dtype: object
In [60]:
# Group by Supplier_name to find the supplier with the highest revenue generated
supplier_revenue = df.groupby('Supplier_name')['Revenue_generated'].sum().reset_index()

# Get the supplier with the highest revenue
top_supplier_revenue = supplier_revenue.loc[supplier_revenue['Revenue_generated'].idxmax()]

# Display the result
print(top_supplier_revenue)
Supplier_name        Supplier D
Revenue_generated    44659527.9
Name: 3, dtype: object
In [61]:
# Group by Supplier Location and calculate the average Lead_time for each location
location_lead_time = df.groupby('Location')['Lead_time'].mean().reset_index()

# Plot using Plotly
fig = px.bar(location_lead_time, x='Location', y='Lead_time', 
             title='Impact of Supplier Location on Lead Time', labels={'Lead_time': 'Lead Time'})
fig.show()
No description has been provided for this image
In [62]:
# Group by Supplier_name to compare Expected_Costs with Manufacturing_costs
supplier_cost_comparison = df.groupby('Supplier_name')[['Expected_Costs', 'Costs']].sum().reset_index()

# Plot using Plotly
fig = px.bar(supplier_cost_comparison, x='Supplier_name', y=['Expected_Costs', 'Costs'], barmode='group',
             title='Comparison of Expected Costs and Costs by Supplier')
fig.show()
No description has been provided for this image
In [63]:
# Group by Supplier_name and Shipping_carriers to find the effect of Shipping_carriers on Shipping_times
carrier_shipping_times = df.groupby(['Supplier_name', 'Shipping_carriers'])['Shipping_times'].mean().reset_index()

# Plot using Plotly
fig = px.bar(carrier_shipping_times, x='Supplier_name', y='Shipping_times', color='Shipping_carriers', 
             title='Effect of Shipping Carriers on Shipping Times by Supplier')
fig.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 5: تحليل العملاء (Customer Analysis)¶

In [64]:
# إجمالي الإيراد لكل فئة
rev = df.groupby("Customer_demographics")["Revenue_generated"].sum()

# تجهيز القيم بصيغة M
labels = [f"{i} ({v/1_000_000:.2f}M)" for i, v in rev.items()]

# ألوان Seaborn
colors = sns.color_palette("Blues", len(rev))

plt.figure(figsize=(8,9))
plt.pie(rev, labels=labels, autopct='%1.1f%%', colors=colors, startangle=120)
plt.title("Revenue by Customer Demographics")
plt.axis('equal')
plt.show()
No description has been provided for this image
In [65]:
avg_sold_demo = df.groupby("Customer_demographics")["Number_of_products_sold"].mean().reset_index()
avg_sold_demo["Number_of_products_sold"] = avg_sold_demo["Number_of_products_sold"].round()
avg_sold_demo = avg_sold_demo.sort_values("Number_of_products_sold", ascending=False)


ax = sns.barplot(data=avg_sold_demo, x="Number_of_products_sold", y="Customer_demographics", palette="viridis")
plt.title("Average Products Sold per Customer Category")
plt.xticks(rotation=45)

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='black')

plt.show()
No description has been provided for this image
In [66]:
order_cust = df.groupby("Customer_demographics")["Order_quantities"].sum().reset_index()
order_cust = order_cust.sort_values("Order_quantities", ascending=False)


fig = px.bar(
    order_cust,
    x="Customer_demographics",
    y="Order_quantities",
    color="Customer_demographics",
    title="Order Quantities by Customer Demographics"
)
fig.show()
No description has been provided for this image
In [67]:
# حساب أعلى 3 منتجات لكل فئة
top3 = (df.groupby(["Customer_demographics","Product_type"])["Number_of_products_sold"].sum().reset_index())

top3 = top3.sort_values(["Customer_demographics","Number_of_products_sold"],ascending=[True, False])

top3 = top3.groupby("Customer_demographics").head(3)

# Plotly bar chart
fig = px.bar(
    top3,
    x="Product_type",
    y="Number_of_products_sold",
    color="Customer_demographics",
    barmode="group",
    title="Top 3 Preferred Products per Customer Category",
    labels={
        "Product_type": "Product",
        "Number_of_products_sold": "Products Sold",
        "Customer_demographics": "Customer Group"
    }
)

fig.update_layout(
    xaxis=dict(tickangle=45),
    height=600
)

fig.show()
No description has been provided for this image
In [68]:
prod_demo = (df.groupby(["Product_type", "Customer_demographics"])["Number_of_products_sold"].sum().reset_index())

# أعلى عميل لكل منتج
max_rows = prod_demo.loc[prod_demo.groupby("Product_type")["Number_of_products_sold"].idxmax()].copy()
max_rows["Type"] = "Max Customer"

# أقل عميل لكل منتج
min_rows = prod_demo.loc[prod_demo.groupby("Product_type")["Number_of_products_sold"].idxmin()].copy()
min_rows["Type"] = "Min Customer"

# دمج الاثنين
extreme_customers = pd.concat([max_rows, min_rows], ignore_index=True)

extreme_customers["Units_k"] = extreme_customers["Number_of_products_sold"] / 1000

# رسم Bar Chart
fig = px.bar(
    extreme_customers,
    x="Product_type",
    y="Number_of_products_sold",
    color="Type",
    barmode="group",
    title="Highest & Lowest Buying Customer per Product",
    labels={
        "Product_type": "Product",
        "Number_of_products_sold": "Units Sold",
    },
    custom_data=["Customer_demographics", "Units_k"]   # ← مهم جداً
)

# تعديل الـ hover ليظهر نوع العميل داخل المربع
fig.update_traces(
    hovertemplate=
    'Type: %{fullData.name}<br>' +
    'Customer: %{customdata[0]}<br>' +    # ← نوع العميل
    'Units Sold: %{customdata[1]:.2f}k<br>' +
    '<extra></extra>'
)

# تحسين التنسيق
fig.update_layout(
    xaxis=dict(tickangle=45),
    height=650
)

fig.show()
No description has been provided for this image
In [69]:
sns.boxplot(data=df, x="Defect_rates", y="Customer_demographics", palette="Set2")
plt.title("Defect Rates Across Customer Demographic Groups")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [70]:
counts_by_demo = (df[df["Defect_rates"] > 6].groupby("Customer_demographics")["Defect_rates"].count().reset_index())
counts_by_demo = counts_by_demo.sort_values("Defect_rates", ascending=False)


ax = sns.barplot(
    data=counts_by_demo,
    x="Defect_rates",
    y="Customer_demographics",
    palette="viridis"
)

plt.title("Count of over Defect Rates by Customer Category")
plt.xlabel("Count")

# Display the values inside the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=10,label_type='center', color='black')

plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 6: تحليل الشحن (Shipping Analysis)¶

In [71]:
ax = df.groupby("Shipping_carriers")["Shipping_times"].mean().sort_values().plot(
    kind="bar", figsize=(10,5), color="teal"
)

plt.title("Fastest Shipping Carrier (Lower is Better)")
plt.ylabel("Average Shipping Time")

# أسهل طريقة لإظهار الأرقام
ax.bar_label(ax.containers[0], fmt="%.0f")
plt.show()
No description has been provided for this image
In [72]:
sns.boxplot(data=df, y="Transportation_modes", x="Shipping_times", palette="coolwarm")
plt.title("Shipping Times by Transportation Mode")
plt.show()
No description has been provided for this image
In [73]:
# حساب متوسط وقت الشحن حسب نتيجة الفحص
grouped = (
    df.groupby(["Inspection_results"])["Shipping_times"]
      .mean()
      .reset_index()
)


color_map = {
    "Pass": "green",
    "Fail": "red",
    "Pending": "steelblue"   # تقدر تغيّره لأي لون
}

# رسم Grouped Bar Chart
fig = px.bar(
    grouped,
    x="Inspection_results",
    y="Shipping_times",
    color="Inspection_results",    # يعمل grouping تلقائي
    title="Shipping Times by Inspection Results",
    labels={
        "Inspection_results": "Result",
        "Shipping_times": "Avg Shipping Time",
    },color_discrete_map=color_map  
)

# تعديل الـ hover لعرض المعلومات المهمة بس
fig.update_traces(
    hovertemplate=(
        "Result: %{fullData.name}<br>"
        "Avg Shipping Time: %{y:.0f}<extra></extra>"
    )
)

fig.update_layout(
    xaxis=dict(tickangle=45),
    height=650
)

fig.show()
No description has been provided for this image
In [74]:
# عدد الشحنات لكل Mode حسب نتيجة الفحص
count_data = (
    df.groupby(["Transportation_modes", "Inspection_results"], as_index=False)
      .size()
      .rename(columns={"size": "Count"})
)

fig = px.bar(
    count_data,
    x="Transportation_modes",
    y="Count",
    color="Inspection_results",
    barmode="group",
    title="Count of Shipments per Transportation Mode by Inspection Result",
    labels={
        "Transportation_modes": "Transportation Mode",
        "Count": "Number of Shipments",
        "Inspection_results": "Inspection Result"
    },
    # ألوان مخصصة
    color_discrete_map={
        "Pass": "green",
        "Fail": "red",
        "Pending": "steelblue"
    }
)

# Hover بسيط وواضح
fig.update_traces(
    hovertemplate=(
        "Transportation Mode: %{x}<br>"
        "Inspection Result: %{fullData.name}<br>"
        "Count: %{y}<extra></extra>"
    )
)

fig.update_layout(xaxis_tickangle=45, height=500)
fig.show()
No description has been provided for this image
In [75]:
# نجمع بيانات الشحن لكل Mode + Inspection Result
agg_data = (
    df.groupby(["Transportation_modes", "Inspection_results"])["Shipping_times"]
      .agg(Avg_shipping_time="mean", Count="size")   # المتوسط + عدد الشحنات
      .reset_index()
)

fig = px.bar(
    agg_data,
    x="Transportation_modes",
    y="Avg_shipping_time",          # متوسط وقت الشحن
    color="Inspection_results",
    barmode="group",
    title="Average Shipping Time per Transportation Mode by Inspection Result",
    labels={
        "Transportation_modes": "Transportation Mode",
        "Avg_shipping_time": "Avg Shipping Time",
        "Inspection_results": "Inspection Result"
    },
    color_discrete_map={
        "Pass": "green",
        "Fail": "red",
        "Pending": "steelblue"
    }
)

# نضيف الـ Count في الـ hover
fig.update_traces(
    hovertemplate=(
        "Transportation Mode: %{x}<br>"
        "Inspection Result: %{fullData.name}<br>"
        "Avg Shipping Time: %{y:.2f} days<br>"
        "<extra></extra>"
    ),
    customdata=agg_data[["Count"]].values
)

fig.update_layout(xaxis_tickangle=45, height=600)
fig.show()
No description has been provided for this image
In [76]:
# نرتب البيانات حسب delay تصاعديًا أو تنازليًا
data = (
    df.groupby("Routes")["Delay"]
      .sum()
      .reset_index()
      .sort_values("Delay", ascending=False)   # ↓ غيّرها لـ True لو عايز العكس
)

fig = px.bar(
    data,
    x="Routes",
    y="Delay",
    color="Delay",
    title="Delay per Route",
)

fig.update_layout(xaxis_tickangle=45, height=500)
fig.show()
No description has been provided for this image
In [77]:
D_routes = df.groupby("Routes")["Defect_rates"].mean().reset_index().sort_values("Defect_rates", ascending= False)

fig = px.bar(
    D_routes,
    x="Routes",
    y="Defect_rates",
    color="Defect_rates",
    title="Average Defect Rates per Route",
    labels={
        "Routes": "Routes",
        "Defect_rates": "Defect rates"
    }
)

fig.update_traces(
    hovertemplate=(
        "Routes: %{x}<br>"
        "Defect rates: %{y:.2f}%<br>"
        "<extra></extra>"
    )
)

fig.update_layout(xaxis_tickangle=45, height=600)
fig.show()
No description has been provided for this image
In [78]:
dev_cost = (
    df.groupby("Routes")["Cost_Deviation"]
      .sum()
      .reset_index()
      .sort_values("Cost_Deviation", ascending=True)
)

fig = px.bar(
    dev_cost,
    x="Cost_Deviation",
    y="Routes",
    color="Cost_Deviation",
    title="Cost Deviation per Route",
    labels={
        "Routes": "Routes",
        "Cost_Deviation": "Deviation (%)"
    },
    orientation='h'   # مهم جداً للـ bar الأفقي
)

# تعديل الـ hover للظهور الصحيح
fig.update_traces(
    hovertemplate=
        "Route: %{y}<br>"           # ← route
        "Deviation: %{x:.2f}%<br>"   # ← deviation
        "<extra></extra>"
)

fig.update_layout(height=500)
fig.show()
No description has been provided for this image
In [79]:
carrier = df.groupby("Shipping_carriers")["Shipping_costs"].sum().reset_index().sort_values("Shipping_costs", ascending=False)


fig = px.bar(
    carrier,
    x="Shipping_carriers",
    y="Shipping_costs",
    color="Shipping_costs",
    title="Total Shipping Costs per Carrier",
    labels={
        "Shipping_carriers": "Shipping carriers",
        "Shipping_costs": "Total cost"
    }
)


fig.update_traces(
    hovertemplate=(
        "Shipping carriers: %{x}<br>"
        "Total cost: %{y:.2f}k<br>"
        "<extra></extra>"
    )
)

fig.update_layout(xaxis_tickangle=45, height=600)
fig.show()
No description has been provided for this image
In [80]:
carrier = df.groupby("Shipping_carriers")["Shipping_costs"].mean().reset_index().sort_values("Shipping_costs", ascending=False)

fig = px.bar(
    carrier,
    x="Shipping_carriers",
    y="Shipping_costs",
    color="Shipping_costs",
    title="Average Shipping Costs per Carrier",
    labels={
        "Shipping_carriers": "Shipping carriers",
        "Shipping_costs": "AVG.Cost"
    }
)

fig.update_traces(
    hovertemplate=(
        "Shipping carriers: %{x}<br>"
        "AVG.Cost: %{y:.2f}$<br>"
        "<extra></extra>"
    )
)

fig.update_layout(xaxis_tickangle=45, height=600)
fig.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 7 — تحليل أداء المنتجات (Product Performance KPIs)¶

In [81]:
top_profit = df.groupby("SKU")["Net_profit"].sum().sort_values(ascending=False).head(10)

ax = top_profit.plot(kind="barh", figsize=(12,6), color='green')
ax.bar_label(ax.containers[0], fmt='{:,.0f}M', fontsize=10, color='white', label_type='center')

plt.title("Top 10 SKU by Net Profit")
plt.xlabel("Total Net Profit")
plt.ylabel("SKU")
plt.gca().invert_yaxis()
plt.show()
No description has been provided for this image
In [82]:
low_profit = df.groupby("SKU")["Net_profit"].sum().sort_values().head(5)

cmap = plt.cm.Reds
colors = cmap(np.linspace(1, 0.3, len(top_profit)))
ax = low_profit.plot(kind="barh", figsize=(12,5), color=colors)
ax.bar_label(ax.containers[0], fmt='{:,.0f}M', fontsize=10, color='White', label_type='center')

plt.title("Lowest 20 SKU by Net Profit")
plt.ylabel("SKU")
plt.xlabel("Total Net Profit")
plt.gca().invert_yaxis()
plt.show()
No description has been provided for this image
In [83]:
df["Profit_margin"] = df["Net_profit"] / df["Revenue_generated"]

top_margin = (
    df.groupby("SKU")["Profit_margin"]
    .mean()
    .sort_values(ascending=False)
    .head(3))

plt.figure(figsize=(12,5))
ax = top_margin.plot(kind="bar", color="teal")
ax.bar_label(ax.containers[0], fmt='{:,.3f}', fontsize=10, color='White', label_type='center')
plt.title("Top 10 SKUs by Profit Margin")
plt.xlabel("SKU")
plt.ylabel("Average Profit Margin")
plt.show()
No description has been provided for this image
In [84]:
df["Profit_margin"] = df["Net_profit"] / df["Revenue_generated"]

top_margin = (
    df.groupby("SKU")["Profit_margin"]
    .mean()
    .nsmallest(3)          # ← أقل 10 بدل أعلى 10
)

plt.figure(figsize=(12,5))
ax = top_margin.plot(kind="bar", color="teal")

ax.bar_label(
    ax.containers[0],
    fmt='{:,.3f}',
    fontsize=10,
    color='white',
    label_type='center'
)

plt.title("Bottom 10 SKUs by Profit Margin")
plt.xlabel("SKU")
plt.ylabel("Average Profit Margin")
plt.show()
No description has been provided for this image
In [85]:
df["ROI"] = df["Net_profit"] / df["Costs"]
roi_table = df.groupby("SKU")["ROI"].mean().sort_values(ascending=False).head(10)

fig = px.bar(roi_table, title="Top 10 SKUs by ROI")

fig.update_traces(
    hovertemplate=(
        "SKU: %{x}<br>"
        "Value: %{y:.0f}k<br>"
        "<extra></extra>" ))

fig.update_layout(xaxis_tickangle=45, height=600)

fig.show()
No description has been provided for this image
In [86]:
df["YearMonth"] = df["Date"].dt.to_period("M")

sku_lifecycle = df.groupby(["SKU", "YearMonth"])["Revenue_generated"].sum().reset_index()
sku_lifecycle["YearMonth"] = sku_lifecycle["YearMonth"].astype(str)


fig = px.line(sku_lifecycle, x="YearMonth", y="Revenue_generated", color="SKU",
              title="Product Lifecycle Trends")

fig.show()
No description has been provided for this image
In [87]:
# ---------- تأكد من تحويل التاريخ ----------
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# ---------- YearMonth كـ string / وعمود datetime لأول يوم في الشهر ----------
df["YearMonth"] = df["Date"].dt.to_period("M").astype(str)
df["YearMonth_dt"] = pd.to_datetime(df["YearMonth"] + "-01")

# ---------- تجميع ----------
sku_lifecycle = df.groupby(["SKU", "YearMonth", "YearMonth_dt"])["Revenue_generated"].sum().reset_index()
# رتّب للتأكد
sku_lifecycle = sku_lifecycle.sort_values(["SKU","YearMonth_dt"])
In [88]:
TOP_N = 10

top_skus = (sku_lifecycle.groupby("SKU")["Revenue_generated"]
            .sum().nlargest(TOP_N).index.tolist())

sku_top = sku_lifecycle[sku_lifecycle["SKU"].isin(top_skus)].copy()

fig = px.line(
    sku_top,
    x="YearMonth_dt",
    y="Revenue_generated",
    color="SKU",
    title=f"Top {TOP_N} SKUs - Revenue over time",
    labels={"YearMonth_dt": "Month", "Revenue_generated": "Revenue"}
)
fig.update_layout(legend=dict(orientation="v", x=1.02, y=1), height=500)
fig.show()
No description has been provided for this image
In [89]:
TOP_N = 8  # اختَر TOP_N لتقليل الفوضى

# نأخذ top SKUs حسب المجموع
top_skus = (sku_lifecycle.groupby("SKU")["Revenue_generated"]
            .sum().nlargest(TOP_N).index.tolist())

df_rm = sku_lifecycle[sku_lifecycle["SKU"].isin(top_skus)].copy()
df_rm = df_rm.sort_values(["SKU","YearMonth_dt"])

# متوسط متحرك 3 شهور
df_rm["Revenue_rolling3"] = df_rm.groupby("SKU")["Revenue_generated"].transform(lambda x: x.rolling(3, min_periods=1).mean())

fig = px.line(
    df_rm,
    x="YearMonth_dt",
    y="Revenue_rolling3",
    color="SKU",
    title=f"Top {TOP_N} SKUs - 3-month rolling average",
    labels={"YearMonth_dt":"Month","Revenue_rolling3":"Revenue (3-mo avg)"}
)
fig.update_layout(height=520)
fig.show()
No description has been provided for this image
In [90]:
TOP_N = 20  # عدد الخيارات في الـ dropdown 

top_skus = (sku_lifecycle.groupby("SKU")["Revenue_generated"]
            .sum().nlargest(TOP_N).index.tolist())

df_dd = sku_lifecycle[sku_lifecycle["SKU"].isin(top_skus)].copy()
df_dd["Revenue_generated"] = df_dd["Revenue_generated"].round(0)

# نبني traces لكل SKU (مخفيين افتراضياً) ثم نضيف أزرار
fig = go.Figure()
for sku in top_skus:
    df_s = df_dd[df_dd["SKU"]==sku]
    fig.add_trace(go.Scatter(
        x=df_s["YearMonth_dt"],
        y=df_s["Revenue_generated"],
        mode="lines+markers",
        name=sku,
        visible=False
    ))

# أول واحد يظهر افتراضياً
if len(fig.data) > 0:
    fig.data[0].visible = True

# أزرار واحدة لكل SKU
buttons = []
for i, sku in enumerate(top_skus):
    vis = [False] * len(top_skus)
    vis[i] = True
    buttons.append(dict(
        label=sku,
        method="update",
        args=[{"visible": vis},
              {"title": f"Revenue for {sku}"}]
    ))

fig.update_layout(
    updatemenus=[dict(active=0, buttons=buttons, x=1.05, y=1)],
    title="Select SKU to display",
    xaxis_title="Month",
    yaxis_title="Revenue",
    height=600
)

fig.show()
No description has been provided for this image
In [91]:
TOP_N = 10 # عدد المربعات (facets) 

top_skus = (sku_lifecycle.groupby("SKU")["Revenue_generated"]
            .sum().nlargest(TOP_N).index.tolist())

df_facet = sku_lifecycle[sku_lifecycle["SKU"].isin(top_skus)].copy()
df_facet["Revenue_generated"] = df_facet["Revenue_generated"].round(0)


fig = px.line(
    df_facet,
    x="YearMonth_dt",
    y="Revenue_generated",
    facet_col="SKU",
    facet_col_wrap=2,   # عدد الأعمدة في الشبكة
    title=f"Top {TOP_N} SKUs",
    labels={"YearMonth_dt":"Month","Revenue_generated":"Revenue"}
)
fig.update_layout(height=800)
# تحسين مسافات المحاور لو لزم
fig.for_each_xaxis(lambda ax: ax.update(tickformat="%Y-%m"))
fig.show()
No description has been provided for this image
In [92]:
TOP_N = 10  # عدد المربعات (facets)

# أقل 10 SKUs حسب إجمالي الإيراد
top_skus = (sku_lifecycle.groupby("SKU")["Revenue_generated"]
            .sum().nsmallest(TOP_N).index.tolist())

df_facet = sku_lifecycle[sku_lifecycle["SKU"].isin(top_skus)].copy()
df_facet["Revenue_generated"] = df_facet["Revenue_generated"].round(0)


fig = px.line(
    df_facet,
    x="YearMonth_dt",
    y="Revenue_generated",
    facet_col="SKU",
    facet_col_wrap=2,   # عدد الأعمدة في الشبكة
    title=f"Lowest {TOP_N} SKUs",
    labels={"YearMonth_dt":"Month","Revenue_generated":"Revenue"}
)

fig.update_layout(height=800)

fig.for_each_xaxis(lambda ax: ax.update(tickformat="%Y-%m"))

fig.show()
No description has been provided for this image
In [93]:
df["Original_Row"] = df.index
df["Issue_flag"] = (df["Revenue_generated"] < df["Revenue_generated"].median()) & \
                   (df["Defect_rates"] > df["Defect_rates"].median())

problematic_products = df[df["Issue_flag"] == True]

problematic_df = problematic_products[["Original_Row", "SKU", "Product_type", "Revenue_generated", "Defect_rates"]]

problematic_df.head()
Out[93]:
Original_Row SKU Product_type Revenue_generated Defect_rates
430 430 Relaxation Balm wellness 3796.42 2.67
435 435 Zinc Tablets healthcare 10751.01 3.77
439 439 Dry Shampoo haircare 3989.70 2.72
441 441 Calcium Supplement healthcare 11069.63 2.70
446 446 Oud Intense fragrance 9340.68 3.55
In [94]:
df["Cost_to_Revenue"] = df["Costs"] / df["Revenue_generated"]

cost_heavy = df.groupby("SKU")["Cost_to_Revenue"].mean().sort_values(ascending=False).head(15)

plt.figure(figsize=(10,5))
ax = cost_heavy.plot(kind="bar", color="orange")
ax.bar_label(ax.containers[0], fmt='{:,.2f}', fontsize=10, color='black', label_type='edge')

plt.title("Products with Highest Cost-to-Revenue Ratio")
plt.ylabel("Cost / Revenue")
plt.show()
No description has been provided for this image
In [95]:
df["CTR"] = df["Costs"] / df["Revenue_generated"]

ctr_sorted = (
    df.groupby("SKU")["CTR"]
    .mean()
    .sort_values(ascending=False))

fig = px.line(
    ctr_sorted.cumsum() / ctr_sorted.sum(),
    title="Pareto Analysis for Cost-to-Revenue Ratio")

fig.update_traces(mode="lines+markers")
fig.show()
No description has been provided for this image
In [96]:
ctr_cat = df.groupby("Product_type")["Costs"].mean().sort_values(ascending=False)

values = ctr_cat.values
cum = np.cumsum(values) / values.sum()

fig, ax1 = plt.subplots(figsize=(12,6))
ax1.bar(ctr_cat.index, values, color='orange')

ax2 = ax1.twinx()
ax2.plot(ctr_cat.index, cum, color='blue', marker='o')

plt.title("Category-level Pareto Chart")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 8 — تحليل سلوك السوق (Market Behavior & Customer Value)¶

In [97]:
aov = df.groupby("Customer_demographics")[["Revenue_generated", "Order_quantities"]].sum()
aov["AOV"] = aov["Revenue_generated"] / aov["Order_quantities"]
aov["AOV"] = aov["AOV"].round(0)

aov_df = aov[["AOV"]].reset_index()
aov_df
Out[97]:
Customer_demographics AOV
0 Female 443.0
1 Male 462.0
2 Non-binary 497.0
3 Unknown 478.0
In [98]:
profit_by_customer = df.groupby("Customer_demographics")["Net_profit"].sum().sort_values(ascending=False)

ax = profit_by_customer.plot(kind="barh", figsize=(10,5), title="Net Profit by Customer Segment")
ax.bar_label(ax.containers[0], fmt='{:,.0f}M', fontsize=10, color='black', label_type='center')

plt.show()
No description has been provided for this image
In [99]:
df["High_Value_Flag"] = df["Revenue_generated"] > df["Revenue_generated"].quantile(0.90)

vip_customers = df[df["High_Value_Flag"] == True]

vip_customers[["Customer_demographics", "SKU", "Revenue_generated"]].head()
Out[99]:
Customer_demographics SKU Revenue_generated
442 Non-binary Dry Shampoo 60466.00
445 Male Shaving Foam 58591.15
470 Unknown Immune Booster 101428.73
490 Unknown Baby Soap 55366.27
497 Non-binary Protein Shake 78379.75
In [100]:
affinity = df.pivot_table(
    index="Customer_demographics",
    columns="Product_type",
    values="Number_of_products_sold",
    aggfunc="sum")

affinity_percent = affinity.div(affinity.sum(axis=1), axis=0) * 10

plt.figure(figsize=(12,6))
ax = sns.heatmap(
    affinity_percent,
    cmap="YlGnBu",
    annot=True,
    fmt=".1f" )

plt.title("Product Affinity by Customer Demographics (%)")
plt.ylabel("Customer Demographics")
plt.xlabel("Product Type")
plt.show()
No description has been provided for this image
In [101]:
profit_by_customer = (
    df.groupby("Location")["Net_profit"]
    .sum()
    .sort_values(ascending=False).head(5)
)

cum_profit = profit_by_customer.cumsum() / profit_by_customer.sum()

fig, ax1 = plt.subplots(figsize=(12,5))
ax1.bar(profit_by_customer.index, profit_by_customer, color="teal")

ax2 = ax1.twinx()
ax2.plot(profit_by_customer.index, cum_profit, color="red", marker="o")

plt.title("Customer Profit Contribution (Pareto)")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 9: التحليل المتقدم – التنبؤ والأنماط¶

In [102]:
df["Date"] = pd.to_datetime(df["Date"])
df["YearMonth"] = df["Date"].dt.to_period("M").astype(str)

monthly_ship = df.groupby("YearMonth")["Shipping_times"].mean().reset_index()
monthly_ship = monthly_ship.round(0)

fig = px.line(monthly_ship, x="YearMonth", y="Shipping_times", markers=True,
              title="Monthly Trend of Shipping Times")
fig.update_layout(xaxis_tickangle=-45)
fig.show()
No description has been provided for this image
In [103]:
pm_corr = df[["Production_volumes", "Manufacturing_costs"]].corr()

sns.heatmap(pm_corr, annot=True, cmap="coolwarm")
plt.title("Correlation Between Production Volume and Manufacturing Costs")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

المجموعة 10: تحليل الاتجاهات الزمنية (Trends & Seasonality)¶

In [104]:
df["Month"] = df["Date"].dt.month
df["Year"] = df["Date"].dt.year

season_pivot = df.pivot_table(
    index="Month",
    columns="Year",
    values="Order_quantities",
    aggfunc="sum"
)
plt.figure(figsize=(12,6))
sns.heatmap(season_pivot, cmap="YlGnBu", annot=True,fmt=".0f")
plt.title("Seasonality of Order Quantities")
plt.show()
No description has been provided for this image
In [105]:
monthly_mlt = df.groupby("Month")["Manufacturing_lead_time"].mean().reset_index()
monthly_mlt = monthly_mlt.round()


fig = px.line(
    monthly_mlt,
    x="Month",
    y="Manufacturing_lead_time",
    markers=True,
    title="Monthly Trend of Manufacturing Lead Time"
)
fig.show()
No description has been provided for this image
In [106]:
df["Quarter"] = df["Date"].dt.quarter

quarter_stock = df.groupby("Quarter")["Stock_levels"].mean().reset_index()

plt.plot(quarter_stock["Quarter"], quarter_stock["Stock_levels"], marker="o")
plt.title("Quarterly Change in Stock Levels")
plt.xlabel("Quarter")
plt.ylabel("Average Stock Level")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

Week3¶

Revenue Forecast¶

In [107]:
# تأكد من العمود Date
df['Date'] = pd.to_datetime(df['Date'])

# عمود YearMonth للاستخدام الزمني
df['YearMonth'] = df['Date'].dt.to_period('M').astype(str)
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.to_period('Q').astype(str)
In [108]:
monthly_total = (
    df.groupby('YearMonth')['Revenue_generated']
      .sum()
      .reset_index()
      .sort_values('YearMonth')
)

monthly_total['YearMonth_dt'] = pd.to_datetime(monthly_total['YearMonth'])

fig = px.line(
    monthly_total,
    x='YearMonth_dt',
    y='Revenue_generated',
    title="Total Monthly Revenue Over Time",
    labels={
        'YearMonth_dt': 'Month',
        'Revenue_generated': 'Total Revenue'
    },
    markers=True, 
    template="plotly_white"
)

fig.update_layout(
    xaxis_title='Month',
    yaxis_title='Total Revenue',
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)

fig.update_traces(hovertemplate="%{y:,.0f}")

fig.show()
No description has been provided for this image
In [109]:
data_rev = monthly_total.copy()

# مؤشر زمني بسيط
data_rev['t'] = np.arange(len(data_rev))

# Features time-based
data_rev['year'] = data_rev['YearMonth_dt'].dt.year
data_rev['month'] = data_rev['YearMonth_dt'].dt.month

# Lag feature – Revenue of previous month
data_rev['rev_lag1'] = data_rev['Revenue_generated'].shift(1)
data_rev['rev_ma3']  = data_rev['Revenue_generated'].rolling(window=3).mean()

# شيل أول كام صف فيهم NaN بسبب shift/rolling
data_rev = data_rev.dropna().reset_index(drop=True)

X = data_rev[['t', 'year', 'month', 'rev_lag1', 'rev_ma3']]
y = data_rev['Revenue_generated']
In [110]:
# بدون Shuffle – تقسيم زمني
split_index = int(len(data_rev) * 0.8)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
In [111]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

def eval_model(y_true, y_pred, name="Model"):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)  # MSE فقط
    rmse = mse ** 0.5    
    r2 = r2_score(y_true, y_pred)
    print(f"{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")

eval_model(y_test, y_pred_lr, "Linear Regression (Total Revenue)")
eval_model(y_test, y_pred_rf, "Random Forest (Total Revenue)")
Linear Regression (Total Revenue) -> MAE: 202145.83, RMSE: 237532.97, R²: 0.491
Random Forest (Total Revenue) -> MAE: 211787.23, RMSE: 237128.61, R²: 0.493
In [112]:
test_period = data_rev.iloc[split_index:].copy()
test_period['Predicted_LR'] = y_pred_lr
test_period['Predicted_RF'] = y_pred_rf
In [113]:
fig = go.Figure()

# Actual Revenue
fig.add_trace(go.Scatter(
    x=test_period['YearMonth_dt'],
    y=test_period['Revenue_generated'],
    mode='lines+markers',
    name='Actual',
    marker=dict(size=8)
))

# Predicted – Random Forest
fig.add_trace(go.Scatter(
    x=test_period['YearMonth_dt'],
    y=test_period['Predicted_RF'],
    mode='lines+markers',
    name='Predicted RF',
    marker=dict(size=8, symbol='x')
))

fig.update_layout(
    title="Total Revenue – Actual vs Predicted (Random Forest)",
    xaxis_title="Month",
    yaxis_title="Revenue",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)

fig.update_traces(hovertemplate="%{y:,.0f}M")

fig.show()
No description has been provided for this image
In [114]:
# آخر صف في الداتا
last_row = data_rev.iloc[-1].copy()

future_rows = []
n_future = 3

for i in range(1, n_future+1):
    new = {}
    new['t'] = last_row['t'] + i
    # تقدير سنة وشهر بناءً على آخر تاريخ
    future_date = last_row['YearMonth_dt'] + pd.DateOffset(months=i)
    new['YearMonth_dt'] = future_date
    new['year'] = future_date.year
    new['month'] = future_date.month
    
    # نستخدم آخر قيم متاحة كـ lag/ma (بشكل مبسط)
    new['rev_lag1'] = last_row['Revenue_generated']
    new['rev_ma3']  = last_row['rev_ma3']  # ممكن تطورها لاحقًا
    
    future_rows.append(new)

future_df = pd.DataFrame(future_rows)

X_future = future_df[['t', 'year', 'month', 'rev_lag1', 'rev_ma3']]
future_df['Predicted_Revenue_RF'] = rf.predict(X_future)

Futere_Revenue = future_df[['YearMonth_dt', 'Predicted_Revenue_RF']]
Futere_Revenue['Predicted_Revenue_RF'] = Futere_Revenue['Predicted_Revenue_RF'].apply(lambda x: f"{x:,.0f}")
Futere_Revenue
Out[114]:
YearMonth_dt Predicted_Revenue_RF
0 2025-01-01 2,975,634
1 2025-02-01 2,974,895
2 2025-03-01 2,961,530
In [115]:
rev_type = (
    df.groupby(['YearMonth', 'Product_type'])['Revenue_generated']
      .sum()
      .reset_index()
)

rev_type['YearMonth_dt'] = pd.to_datetime(rev_type['YearMonth'])
rev_type = rev_type.sort_values(['Product_type', 'YearMonth_dt'])
In [116]:
product = "wellness"   # غيّر النوع على حسب الداتا عندك

rev_p = rev_type[rev_type['Product_type'] == product].copy()

rev_p['t'] = np.arange(len(rev_p))
rev_p['year'] = rev_p['YearMonth_dt'].dt.year
rev_p['month'] = rev_p['YearMonth_dt'].dt.month

rev_p['rev_lag1'] = rev_p['Revenue_generated'].shift(1)
rev_p['rev_ma3']  = rev_p['Revenue_generated'].rolling(window=3).mean()
rev_p = rev_p.dropna().reset_index(drop=True)

X_p = rev_p[['t', 'year', 'month', 'rev_lag1', 'rev_ma3']]
y_p = rev_p['Revenue_generated']

split_idx_p = int(len(rev_p) * 0.8)
X_train_p, X_test_p = X_p.iloc[:split_idx_p], X_p.iloc[split_idx_p:]
y_train_p, y_test_p = y_p.iloc[:split_idx_p], y_p.iloc[split_idx_p:]

rf_p = RandomForestRegressor(n_estimators=200, random_state=42)
rf_p.fit(X_train_p, y_train_p)
y_pred_p = rf_p.predict(X_test_p)

eval_model(y_test_p, y_pred_p, f"RF – Product_type = {product}")
RF – Product_type = wellness -> MAE: 86013.98, RMSE: 96678.11, R²: 0.062
In [117]:
test_period_p = rev_p.iloc[split_idx_p:].copy() 
test_period_p['Predicted_RF'] = y_pred_p

fig = go.Figure()
# Actual
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Revenue_generated'],
    mode='lines+markers',
    name='Actual'
))

# Predicted
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Predicted_RF'],
    mode='lines+markers',
    name='Predicted',
    marker=dict(symbol='x')
))

fig.update_layout(
    title=f"Revenue Forecast – Product_type = {product}",
    xaxis_title="Month",
    yaxis_title="Revenue",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)
fig.update_traces(hovertemplate="%{y:,.0f}")
fig.show()
No description has been provided for this image
In [118]:
rev_sku = (
    df.groupby(['YearMonth', 'SKU'])['Revenue_generated']
      .sum()
      .reset_index()
)
rev_sku['YearMonth_dt'] = pd.to_datetime(rev_sku['YearMonth'])
rev_sku = rev_sku.sort_values(['SKU', 'YearMonth_dt'])
In [119]:
sku_id = "Relaxation Balm"  

rev_s = rev_sku[rev_sku['SKU'] == sku_id].copy()

if len(rev_s) < 8:
    print("Data not enough for this SKU – choose another one")
else:
    rev_s['t'] = np.arange(len(rev_s))
    rev_s['year'] = rev_s['YearMonth_dt'].dt.year
    rev_s['month'] = rev_s['YearMonth_dt'].dt.month

    rev_s['rev_lag1'] = rev_s['Revenue_generated'].shift(1)
    rev_s['rev_ma3']  = rev_s['Revenue_generated'].rolling(window=3).mean()
    rev_s = rev_s.dropna().reset_index(drop=True)

    X_s = rev_s[['t', 'year', 'month', 'rev_lag1', 'rev_ma3']]
    y_s = rev_s['Revenue_generated']

    split_idx_s = int(len(rev_s) * 0.8)
    X_train_s, X_test_s = X_s.iloc[:split_idx_s], X_s.iloc[split_idx_s:]
    y_train_s, y_test_s = y_s.iloc[:split_idx_s], y_s.iloc[split_idx_s:]

    rf_s = RandomForestRegressor(n_estimators=200, random_state=42)
    rf_s.fit(X_train_s, y_train_s)
    y_pred_s = rf_s.predict(X_test_s)

    eval_model(y_test_s, y_pred_s, f"RF – SKU = {sku_id}")
RF – SKU = Relaxation Balm -> MAE: 17754.40, RMSE: 20614.32, R²: 0.087
In [120]:
test_period_s = rev_s.iloc[split_idx_s:].copy()
test_period_s['Predicted_RF'] = y_pred_s

fig = go.Figure()
# Actual
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Revenue_generated'],
    mode='lines+markers',
    name='Actual'
))

# Predicted
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Predicted_RF'],
    mode='lines+markers',
    name='Predicted',
    marker=dict(symbol='x')
))

fig.update_layout(
    title=f"Revenue Forecast – SKU = {sku_id}",
    xaxis_title="Month",
    yaxis_title="Revenue",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)

fig.update_traces(hovertemplate="%{y:,.0f}")

fig.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

Demand Forecast¶

In [121]:
monthly_demand  = (
    df.groupby('YearMonth')['Number_of_products_sold']
      .sum()
      .reset_index()
      .sort_values('YearMonth')
)

monthly_demand['YearMonth_dt'] = pd.to_datetime(monthly_demand['YearMonth'])

fig = px.line(
    monthly_demand,
    x='YearMonth_dt',
    y='Number_of_products_sold',
    title="Monthly Demand Trend (Products Sold)",
    labels={
        'YearMonth_dt': 'Month',
        'Number_of_products_sold': 'Total sold products'
    },
    markers=True, 
    template="plotly_white"
)

fig.update_layout(
    xaxis_title='Month',
    yaxis_title='Total sold products',
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)
fig.update_traces(hovertemplate="%{y:,.0f}")
fig.show()
No description has been provided for this image
In [122]:
demand_df = monthly_demand.copy()

demand_df['t'] = np.arange(len(demand_df))
demand_df['year'] = demand_df['YearMonth_dt'].dt.year
demand_df['month'] = demand_df['YearMonth_dt'].dt.month

demand_df['lag1'] = demand_df['Number_of_products_sold'].shift(1)
demand_df['ma3']  = demand_df['Number_of_products_sold'].rolling(window=3).mean()

demand_df = demand_df.dropna().reset_index(drop=True)

X = demand_df[['t','year','month','lag1','ma3']]
y = demand_df['Number_of_products_sold']
In [123]:
split_idx = int(len(demand_df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
In [124]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

def eval_model(y_true, y_pred, name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)  # MSE فقط
    rmse = mse ** 0.5 
    r2 = r2_score(y_true, y_pred)
    print(f"{name} -> MAE={mae:.2f}, RMSE={rmse:.2f}, R2={r2:.3f}")

eval_model(y_test, y_pred_lr, "LR Demand Model")
eval_model(y_test, y_pred_rf, "RF Demand Model")
LR Demand Model -> MAE=3245.89, RMSE=4132.01, R2=0.422
RF Demand Model -> MAE=3198.01, RMSE=3695.10, R2=0.537
In [128]:
test_period = demand_df.iloc[split_idx:].copy()
test_period['Predicted_RF'] = y_pred_rf

fig = go.Figure()

# Actual Revenue
fig.add_trace(go.Scatter(
    x=test_period['YearMonth_dt'],
    y=test_period['Number_of_products_sold'],
    mode='lines+markers',
    name='Actual',
    marker=dict(size=8)
))

# Predicted – Random Forest
fig.add_trace(go.Scatter(
    x=test_period['YearMonth_dt'],
    y=test_period['Predicted_RF'],
    mode='lines+markers',
    name='Predicted RF',
    marker=dict(size=8, symbol='x')
))

fig.update_layout(
    title="Demand Forecast – Actual vs Predicted",
    xaxis_title="Month",
    yaxis_title="Total sold products",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)
fig.update_traces(hovertemplate="%{y:,.0f}")
fig.show()
No description has been provided for this image
In [129]:
last = demand_df.iloc[-1]
future_rows = []
n_future = 6

for i in range(1, n_future+1):
    new = {}
    new['t'] = last['t'] + i
    future_date = last['YearMonth_dt'] + pd.DateOffset(months=i)
    new['YearMonth_dt'] = future_date
    new['year'] = future_date.year
    new['month'] = future_date.month
    new['lag1'] = last['Number_of_products_sold']
    new['ma3'] = last['ma3']
    future_rows.append(new)

future_df = pd.DataFrame(future_rows)

X_future = future_df[['t','year','month','lag1','ma3']]
future_df['Predicted_Demand_RF'] = rf.predict(X_future)

Futere_Demand = future_df[['YearMonth_dt','Predicted_Demand_RF']]
Futere_Demand['Predicted_Demand_RF'] = Futere_Demand['Predicted_Demand_RF'].apply(lambda x: f"{x:,.0f}")
Futere_Demand
Out[129]:
YearMonth_dt Predicted_Demand_RF
0 2025-01-01 59,690
1 2025-02-01 59,667
2 2025-03-01 59,370
3 2025-04-01 59,508
4 2025-05-01 59,432
5 2025-06-01 59,782
In [132]:
monthly_orders = (
    df.groupby('YearMonth')['Order_quantities']
      .sum()
      .reset_index()
      .sort_values('YearMonth')
)

monthly_orders['YearMonth_dt'] = pd.to_datetime(monthly_orders['YearMonth'])

fig = px.line(
    monthly_orders,
    x='YearMonth_dt',
    y='Order_quantities',
    title="Monthly Order Quantities Trend",
    labels={
        'YearMonth_dt': 'Month',
        'Order_quantities': 'Total Orders'
    },
    markers=True, 
    template="plotly_white"
)

fig.update_layout(
    xaxis_title='Month',
    yaxis_title='Total Orders',
)

fig.show()
No description has been provided for this image
In [133]:
order_df = monthly_orders.copy()
order_df['t'] = np.arange(len(order_df))
order_df['year'] = order_df['YearMonth_dt'].dt.year
order_df['month'] = order_df['YearMonth_dt'].dt.month

order_df['lag1'] = order_df['Order_quantities'].shift(1)
order_df['ma3']  = order_df['Order_quantities'].rolling(window=3).mean()

order_df = order_df.dropna().reset_index(drop=True)

Xo = order_df[['t','year','month','lag1','ma3']]
yo = order_df['Order_quantities']
In [134]:
split_idx_o = int(len(order_df) * 0.8)
Xo_train, Xo_test = Xo.iloc[:split_idx_o], Xo.iloc[split_idx_o:]
yo_train, yo_test = yo.iloc[:split_idx_o], yo.iloc[split_idx_o:]

rf_o = RandomForestRegressor(n_estimators=200, random_state=42)
rf_o.fit(Xo_train, yo_train)

yo_pred = rf_o.predict(Xo_test)

eval_model(yo_test, yo_pred, "RF Order Quantity Forecast")
RF Order Quantity Forecast -> MAE=497.60, RMSE=573.21, R2=0.207
In [135]:
last_o = order_df.iloc[-1]
future_rows = []
n_future = 6

for i in range(1, n_future++1):
    new = {}
    future_date = last_o['YearMonth_dt'] + pd.DateOffset(months=i)
    new['t'] = last_o['t'] + i
    new['YearMonth_dt'] = future_date
    new['year'] = future_date.year
    new['month'] = future_date.month
    new['lag1'] = last_o['Order_quantities']
    new['ma3'] = last_o['ma3']
    future_rows.append(new)

future_orders = pd.DataFrame(future_rows)
future_orders['Predicted_Order_RF'] = rf_o.predict(future_orders[['t','year','month','lag1','ma3']])

Futere_Orders = future_orders[['YearMonth_dt','Predicted_Order_RF']]
Futere_Orders['Predicted_Order_RF'] = Futere_Orders['Predicted_Order_RF'].apply(lambda x: f"{x:,.0f}")
Futere_Orders
Out[135]:
YearMonth_dt Predicted_Order_RF
0 2025-01-01 5,818
1 2025-02-01 5,682
2 2025-03-01 5,738
3 2025-04-01 5,771
4 2025-05-01 5,754
5 2025-06-01 5,755
In [ ]:
 
In [ ]:
 
In [ ]:
 

Profit Forecast¶

In [136]:
df['Date'] = pd.to_datetime(df['Date'])
df['YearMonth'] = df['Date'].dt.to_period('M').astype(str)

df = df.dropna(subset=['Net_profit'])
In [137]:
# تجميع شهري: ربح، إيراد، تكلفة، متوسط defect
monthly_profit = (
    df.groupby('YearMonth')
      .agg({
          'Net_profit': 'sum',
          'Revenue_generated': 'sum',
          'Costs': 'sum',
          'Defect_rates': 'mean'
      })
      .reset_index()
      .sort_values('YearMonth')
)

monthly_profit['YearMonth_dt'] = pd.to_datetime(monthly_profit['YearMonth'])
In [138]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=monthly_profit['YearMonth_dt'],
    y=monthly_profit['Net_profit'],
    mode='lines+markers',
    name='Net Profit'
))

fig.update_layout(
    title="Total Monthly Net Profit Over Time",
    xaxis_title="Month",
    yaxis_title="Net Profit",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)

fig.update_traces(hovertemplate="%{y:,.0f}")

fig.show()
No description has been provided for this image
In [139]:
profit_df = monthly_profit.copy()

# مؤشر زمني
profit_df['t'] = np.arange(len(profit_df))
profit_df['year'] = profit_df['YearMonth_dt'].dt.year
profit_df['month'] = profit_df['YearMonth_dt'].dt.month

# Lag & Moving Average على الربح
profit_df['profit_lag1'] = profit_df['Net_profit'].shift(1)
profit_df['profit_ma3']  = profit_df['Net_profit'].rolling(window=3).mean()

# ممكن تستخدم كمان Revenue & Costs كـ Features
# لو حابب تبسط، سيبهم – لو حابب تزود قوة الموديل، ضيفهم
features = ['t', 'year', 'month', 'profit_lag1', 'profit_ma3',
            'Revenue_generated', 'Costs', 'Defect_rates']

profit_df = profit_df.dropna().reset_index(drop=True)

X = profit_df[features]
y = profit_df['Net_profit']
In [140]:
split_idx = int(len(profit_df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
In [142]:
def eval_model(y_true, y_pred, name="Model"):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)  # MSE فقط
    rmse = mse ** 0.5 
    r2 = r2_score(y_true, y_pred)
    print(f"{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")

# Linear Regression
lr_p = LinearRegression()
lr_p.fit(X_train, y_train)
y_pred_lr = lr_p.predict(X_test)
eval_model(y_test, y_pred_lr, "Linear Regression (Net Profit)")

# Random Forest
rf_p = RandomForestRegressor(n_estimators=200, random_state=42)
rf_p.fit(X_train, y_train)
y_pred_rf = rf_p.predict(X_test)
eval_model(y_test, y_pred_rf, "Random Forest (Net Profit)")
Linear Regression (Net Profit) -> MAE: 7522.23, RMSE: 9409.54, R²: 0.999
Random Forest (Net Profit) -> MAE: 20161.49, RMSE: 31928.55, R²: 0.991
In [143]:
test_period_p = profit_df.iloc[split_idx:].copy()
test_period_p['Predicted_RF'] = y_pred_rf

fig = go.Figure()

# Actual Net Profit
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Net_profit'],
    mode='lines+markers',
    name='Actual'
))

# Predicted Net Profit (RF)
fig.add_trace(go.Scatter(
    x=test_period_p['YearMonth_dt'],
    y=test_period_p['Predicted_RF'],
    mode='lines+markers',
    name='Predicted RF',
    marker=dict(symbol='x')
))

fig.update_layout(
    title="Net Profit – Actual vs Predicted (Random Forest)",
    xaxis_title="Month",
    yaxis_title="Net Profit",
    template="plotly_white",
    hovermode="x unified",
    height=500,
    width=900
)
fig.update_traces(hovertemplate="%{y:,.0f}")
fig.show()
No description has been provided for this image
In [145]:
last_row = profit_df.iloc[-1].copy()

future_rows = []
n_future = 3  # زوّد الرقم لو عايز شهور أكتر

for i in range(1, n_future + 1):
    new = {}
    new['t'] = last_row['t'] + i
    future_date = last_row['YearMonth_dt'] + pd.DateOffset(months=i)
    new['YearMonth_dt'] = future_date
    new['year'] = future_date.year
    new['month'] = future_date.month
    
    # نستخدم آخر قيم معروفة كـ تقريب (تقدر تطور المنطق ده لو حابب)
    new['profit_lag1'] = last_row['Net_profit']
    new['profit_ma3']  = last_row['profit_ma3']
    new['Revenue_generated'] = last_row['Revenue_generated']
    new['Costs'] = last_row['Costs']
    new['Defect_rates'] = last_row['Defect_rates']
    
    future_rows.append(new)

future_profit = pd.DataFrame(future_rows)
X_future = future_profit[features]

future_profit['Predicted_Net_Profit_RF'] = rf_p.predict(X_future)

Futere_Profit = future_profit[['YearMonth_dt','Predicted_Net_Profit_RF']]
Futere_Profit['Predicted_Net_Profit_RF'] = Futere_Profit['Predicted_Net_Profit_RF'].apply(lambda x: f"{x:,.0f}")
Futere_Profit
Out[145]:
YearMonth_dt Predicted_Net_Profit_RF
0 2025-01-01 2,892,618
1 2025-02-01 2,892,618
2 2025-03-01 2,891,516
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: